# For inserting images. Imported here because this cell's code runs before the
# import cell below — without this, `display`/`Image` raise NameError when the
# file is executed top-to-bottom as a script.
from IPython.display import Image, display

# Show the supermarket cover image at the top of the notebook.
image_path = "supermarket.jpeg"
display(Image(filename=image_path))
# for inserting images
from IPython.display import Image
from IPython.display import display
# to load the datasets
import numpy as np
import pandas as pd
# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
# Data integration
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# For Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# For Hadoop Mapreduce
import sys
from nltk.tokenize import word_tokenize
# For Pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer
[nltk_data] Downloading package stopwords to [nltk_data] /Users/pritika_timsina/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] /Users/pritika_timsina/nltk_data... [nltk_data] Package punkt is already up-to-date!
# Load the supermarket sales dataset and preview the first ten rows.
supermarkt = pd.read_csv("supermarket_sales.csv")
supermarkt.head(n=10)
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Date | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 750-67-8428 | A | Yangon | Member | Female | Health and beauty | 74.69 | 7 | 26.1415 | 548.9715 | 1/5/2019 | 13:08 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 |
| 1 | 226-31-3081 | C | Naypyitaw | Normal | Female | Electronic accessories | 15.28 | 5 | 3.8200 | 80.2200 | 3/8/2019 | 10:29 | Cash | 76.40 | 4.761905 | 3.8200 | 9.6 |
| 2 | 631-41-3108 | A | Yangon | Normal | Male | Home and lifestyle | 46.33 | 7 | 16.2155 | 340.5255 | 3/3/2019 | 13:23 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 |
| 3 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 1/27/2019 | 20:33 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 |
| 4 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 2/8/2019 | 10:37 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 |
| 5 | 699-14-3026 | C | Naypyitaw | Normal | Male | Electronic accessories | 85.39 | 7 | 29.8865 | 627.6165 | 3/25/2019 | 18:30 | Ewallet | 597.73 | 4.761905 | 29.8865 | 4.1 |
| 6 | 355-53-5943 | A | Yangon | Member | Female | Electronic accessories | 68.84 | 6 | 20.6520 | 433.6920 | 2/25/2019 | 14:36 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 |
| 7 | 315-22-5665 | C | Naypyitaw | Normal | Female | Home and lifestyle | 73.56 | 10 | 36.7800 | 772.3800 | 2/24/2019 | 11:38 | Ewallet | 735.60 | 4.761905 | 36.7800 | 8.0 |
| 8 | 665-32-9167 | A | Yangon | Member | Female | Health and beauty | 36.26 | 2 | 3.6260 | 76.1460 | 1/10/2019 | 17:15 | Credit card | 72.52 | 4.761905 | 3.6260 | 7.2 |
| 9 | 692-92-5582 | B | Mandalay | Member | Female | Food and beverages | 54.84 | 3 | 8.2260 | 172.7460 | 2/20/2019 | 13:27 | Credit card | 164.52 | 4.761905 | 8.2260 | 5.9 |
# Summary statistics (count/mean/std/min/quartiles/max) for the numeric columns.
supermarkt.describe()
| Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
|---|---|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 | 1.000000e+03 | 1000.000000 | 1000.00000 |
| mean | 55.672130 | 5.510000 | 15.379369 | 322.966749 | 307.58738 | 4.761905e+00 | 15.379369 | 6.97270 |
| std | 26.494628 | 2.923431 | 11.708825 | 245.885335 | 234.17651 | 6.131498e-14 | 11.708825 | 1.71858 |
| min | 10.080000 | 1.000000 | 0.508500 | 10.678500 | 10.17000 | 4.761905e+00 | 0.508500 | 4.00000 |
| 25% | 32.875000 | 3.000000 | 5.924875 | 124.422375 | 118.49750 | 4.761905e+00 | 5.924875 | 5.50000 |
| 50% | 55.230000 | 5.000000 | 12.088000 | 253.848000 | 241.76000 | 4.761905e+00 | 12.088000 | 7.00000 |
| 75% | 77.935000 | 8.000000 | 22.445250 | 471.350250 | 448.90500 | 4.761905e+00 | 22.445250 | 8.50000 |
| max | 99.960000 | 10.000000 | 49.650000 | 1042.650000 | 993.00000 | 4.761905e+00 | 49.650000 | 10.00000 |
# Listing out the column names of the dataset.
supermarkt.columns.tolist()
['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating']
# Drop the columns that are not needed for the analysis below.
unneeded = ['Invoice ID', 'Unit price', 'Tax 5%', 'Date', 'Time',
            'cogs', 'gross margin percentage']
supermarkt.drop(columns=unneeded, inplace=True)
# Confirm the resulting shape (rows, columns).
print(supermarkt.shape)
(1000, 10)
# Bar chart: total sales per branch.
totals_by_branch = supermarkt.groupby('Branch')['Total'].sum().reset_index()
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(totals_by_branch['Branch'], totals_by_branch['Total'], color=['#1f77b4'])
ax.set_title('Branch-wise Sales Comparison')
ax.set_xlabel('Branch')
ax.set_ylabel('Total Sales')
plt.show()
# The figure below shows that Branch C has the highest sales compared to the other two
# branches, but the difference isn't significantly big: all branches are above 100k.
# Branch C has sales of over 110k, while branches A and B have sales just above 106k.
# Correlation matrix for the quantity sold, total amount, gross income and rating.
selected = supermarkt[['Quantity', 'Total', 'gross income', 'Rating']]
plt.figure(figsize=(8, 6))
sns.heatmap(selected.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
# The correlation matrix shows the relationships between the variables:
# Rating has a negative correlation with the other variables, while the others
# are positively correlated with each other.
# Mapping Words
# A sample of values from the cities column in the dataset.
cities = ['Yangon', 'Naypyitaw', 'Yangon', 'Mandalay', 'Yangon']
city_mapping = {
'Yangon': 'City A',
'Naypyitaw': 'City B',
'Mandalay': 'City C',
}
# Translate each city name to its label via the dictionary.
mapped_city = list(map(city_mapping.__getitem__, cities))
print(mapped_city)
['City A', 'City B', 'City A', 'City C', 'City A']
# Schema Mapping
# Select the remaining columns in a fixed order and rename them to the
# predefined schema names.
column_order = ['Branch', 'City', 'Customer type', 'Gender', 'Product line',
                'Quantity', 'Total', 'Payment', 'gross income', 'Rating']
rename_map = {
    'Customer type': 'Customer Types',
    'Product line': 'Product Lines',
    'Total': 'Total made',
    'Payment': 'Payment Methods',
    'gross income': 'Gross Income',
    'Rating': 'Customer Rating',
}
# Creating a new data frame with the mapped schema.
new_df = supermarkt[column_order].rename(columns=rename_map)
print(new_df)
Branch City Customer Types Gender Product Lines \
0 A Yangon Member Female Health and beauty
1 C Naypyitaw Normal Female Electronic accessories
2 A Yangon Normal Male Home and lifestyle
3 A Yangon Member Male Health and beauty
4 A Yangon Normal Male Sports and travel
.. ... ... ... ... ...
995 C Naypyitaw Normal Male Health and beauty
996 B Mandalay Normal Female Home and lifestyle
997 A Yangon Member Male Food and beverages
998 A Yangon Normal Male Home and lifestyle
999 A Yangon Member Female Fashion accessories
Quantity Total made Payment Methods Gross Income Customer Rating
0 7 548.9715 Ewallet 26.1415 9.1
1 5 80.2200 Cash 3.8200 9.6
2 7 340.5255 Credit card 16.2155 7.4
3 8 489.0480 Ewallet 23.2880 8.4
4 7 634.3785 Ewallet 30.2085 5.3
.. ... ... ... ... ...
995 1 42.3675 Ewallet 2.0175 6.2
996 10 1022.4900 Ewallet 48.6900 4.4
997 1 33.4320 Cash 1.5920 7.7
998 1 69.1110 Cash 3.2910 4.1
999 7 649.2990 Cash 30.9190 6.6
[1000 rows x 10 columns]
# Data Transformation
# Tokenize the text, lower-case the tokens, remove English stop words like
# "the", "a", "an" (if any), and stem the remaining tokens.

# Built once at import time so they are not rebuilt on every call — the
# function is applied row-by-row across the whole dataset.
_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()


def process_thetext(texts):
    """Tokenize, lower-case, stop-word-filter and stem *texts*.

    Parameters
    ----------
    texts : str
        Raw text (e.g. a product-line label).

    Returns
    -------
    str
        The processed tokens joined back into one space-separated string.
    """
    tokens = [token.lower() for token in word_tokenize(texts)]
    filtered = [token for token in tokens if token not in _STOP_WORDS]
    return ' '.join(_STEMMER.stem(token) for token in filtered)
# Preprocess every product line, then build a TF-IDF representation of it.
supermarkt['Processed Product Lines'] = supermarkt['Product line'].map(process_thetext)
tfidf_vector = TfidfVectorizer(max_features=1000)
tfidf_matrixversion = tfidf_vector.fit_transform(supermarkt['Processed Product Lines'])
print(tfidf_matrixversion)
(0, 1) 0.7071067811865476 (0, 6) 0.7071067811865476 (1, 0) 0.5959741331214485 (1, 3) 0.8030036317789215 (2, 8) 0.7071067811865475 (2, 7) 0.7071067811865475 (3, 1) 0.7071067811865476 (3, 6) 0.7071067811865476 (4, 10) 0.7071067811865475 (4, 9) 0.7071067811865475 (5, 0) 0.5959741331214485 (5, 3) 0.8030036317789215 (6, 0) 0.5959741331214485 (6, 3) 0.8030036317789215 (7, 8) 0.7071067811865475 (7, 7) 0.7071067811865475 (8, 1) 0.7071067811865476 (8, 6) 0.7071067811865476 (9, 2) 0.7071067811865475 (9, 5) 0.7071067811865475 (10, 4) 0.798214609195472 (10, 0) 0.6023731714368757 (11, 0) 0.5959741331214485 (11, 3) 0.8030036317789215 (12, 0) 0.5959741331214485 : : (987, 6) 0.7071067811865476 (988, 0) 0.5959741331214485 (988, 3) 0.8030036317789215 (989, 1) 0.7071067811865476 (989, 6) 0.7071067811865476 (990, 2) 0.7071067811865475 (990, 5) 0.7071067811865475 (991, 10) 0.7071067811865475 (991, 9) 0.7071067811865475 (992, 0) 0.5959741331214485 (992, 3) 0.8030036317789215 (993, 4) 0.798214609195472 (993, 0) 0.6023731714368757 (994, 0) 0.5959741331214485 (994, 3) 0.8030036317789215 (995, 1) 0.7071067811865476 (995, 6) 0.7071067811865476 (996, 8) 0.7071067811865475 (996, 7) 0.7071067811865475 (997, 2) 0.7071067811865475 (997, 5) 0.7071067811865475 (998, 8) 0.7071067811865475 (998, 7) 0.7071067811865475 (999, 4) 0.798214609195472 (999, 0) 0.6023731714368757
# Inspect the TF-IDF matrix as a dense DataFrame.
tfidf_data = pd.DataFrame(
    tfidf_matrixversion.toarray(),
    columns=tfidf_vector.get_feature_names_out(),
)
print(tfidf_data.head())
accessori beauti beverag electron fashion food health home \ 0 0.000000 0.707107 0.0 0.000000 0.0 0.0 0.707107 0.000000 1 0.595974 0.000000 0.0 0.803004 0.0 0.0 0.000000 0.000000 2 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.707107 3 0.000000 0.707107 0.0 0.000000 0.0 0.0 0.707107 0.000000 4 0.000000 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.000000 lifestyl sport travel 0 0.000000 0.000000 0.000000 1 0.000000 0.000000 0.000000 2 0.707107 0.000000 0.000000 3 0.000000 0.000000 0.000000 4 0.000000 0.707107 0.707107
# Data Reconciliation
# Report any columns that contain null values.
null_counts = supermarkt.isna().sum()
print("Columns with missing values:")
print(null_counts[null_counts > 0])
# There are no null values in my dataset
Columns with missing values: Series([], dtype: int64)
# Report any fully duplicated rows.
dup_mask = supermarkt.duplicated()
print("Duplicate rows:")
print(supermarkt[dup_mask])
# There are no duplicate values in my dataset
Duplicate rows: Empty DataFrame Columns: [Branch, City, Customer type, Gender, Product line, Quantity, Total, Payment, gross income, Rating, Processed Product Lines] Index: []
# codes for mapper.py and reducer.py that was used in Hadoop later
for hadoop_code_image in ("mapper.png", "reducer.png"):
    display(Image(filename=hadoop_code_image))
# Running Hadoop in the terminal codes
for terminal_shot in ("result-3.jpg", "result-2.png", "result-1.png", "result.png"):
    display(Image(filename=terminal_shot))
# snippet of the output I got in my new folder called outputex_new that I got after running the code in terminal.
# The _SUCCESS file was empty however the other folder had all the outputs which are listed below.
for output_shot in ("output.png", "output_1.png", "output_2.png"):
    display(Image(filename=output_shot))
# Start the PySpark session and load the supermarket dataset into Spark.
sparks = (
    SparkSession.builder
    .appName("SupermarktData")
    .getOrCreate()
)
dataset = sparks.read.csv('supermarket_sales.csv', header=True, inferSchema=True)
dataset.show(3)
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+--------+-------------------+-----------+------+-----------------------+------------+------+ | Invoice ID|Branch| City|Customer type|Gender| Product line|Unit price|Quantity| Tax 5%| Total| Date| Time| Payment| cogs|gross margin percentage|gross income|Rating| +-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+--------+-------------------+-----------+------+-----------------------+------------+------+ |750-67-8428| A| Yangon| Member|Female| Health and beauty| 74.69| 7|26.1415|548.9715|1/5/2019|2023-09-15 13:08:00| Ewallet|522.83| 4.761904762| 26.1415| 9.1| |226-31-3081| C|Naypyitaw| Normal|Female|Electronic access...| 15.28| 5| 3.82| 80.22|3/8/2019|2023-09-15 10:29:00| Cash| 76.4| 4.761904762| 3.82| 9.6| |631-41-3108| A| Yangon| Normal| Male| Home and lifestyle| 46.33| 7|16.2155|340.5255|3/3/2019|2023-09-15 13:23:00|Credit card|324.31| 4.761904762| 16.2155| 7.4| +-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+--------+-------------------+-----------+------+-----------------------+------------+------+ only showing top 3 rows
# Data groupby: count rows per (city, product line) combination.
products_acc_city = dataset.groupBy('City', 'Product line').count()
products_acc_city.show(10)
+---------+--------------------+-----+ | City| Product line|count| +---------+--------------------+-----+ |Naypyitaw|Electronic access...| 55| | Mandalay| Health and beauty| 53| |Naypyitaw| Sports and travel| 45| | Yangon|Electronic access...| 60| | Mandalay|Electronic access...| 55| | Mandalay| Fashion accessories| 62| |Naypyitaw| Food and beverages| 66| |Naypyitaw| Fashion accessories| 65| | Yangon| Sports and travel| 59| | Yangon| Home and lifestyle| 65| +---------+--------------------+-----+ only showing top 10 rows
# Vector transformation: pack the Quantity column into a 'features' vector.
quantity_assembler = VectorAssembler(inputCols=['Quantity'], outputCol='features')
vector_data = quantity_assembler.transform(dataset)
# Linear regression of Total on Quantity.
sales_lr = LinearRegression(featuresCol='features', labelCol='Total')
fitting = sales_lr.fit(vector_data)
23/09/14 23:18:40 WARN Instrumentation: [fb243581] regParam is zero, which might cause numerical instability and overfitting.
# Predict Total from Quantity on the training data and show a sample.
sales_predictions = fitting.transform(vector_data)
sales_predictions.select('Quantity', 'Total', 'prediction').show()
+--------+--------+------------------+ |Quantity| Total| prediction| +--------+--------+------------------+ | 7|548.9715|411.38245041895516| | 5| 80.22| 292.7036565680088| | 7|340.5255|411.38245041895516| | 8| 489.048| 470.7218473444284| | 7|634.3785|411.38245041895516| | 7|627.6165|411.38245041895516| | 6| 433.692| 352.043053493482| | 10| 772.38| 589.4006411953749| | 2| 76.146|114.68546579158914| | 3| 172.746|174.02486271706235| | 4| 60.816|233.36425964253556| | 4| 107.142|233.36425964253556| | 5|246.4875| 292.7036565680088| | 10| 453.495| 589.4006411953749| | 10| 749.49| 589.4006411953749| | 6| 590.436| 352.043053493482| | 7|506.6355|411.38245041895516| | 6| 457.443| 352.043053493482| | 3|172.2105|174.02486271706235| | 2| 84.63|114.68546579158914| +--------+--------+------------------+ only showing top 20 rows
# PySpark Visualisation
# Histogram of Unit Price
spark = SparkSession.builder.appName("DataVisualization").getOrCreate()
dataset = spark.read.csv('supermarket_sales.csv', header=True, inferSchema=True)
# Collect the non-null unit prices to the driver for plotting.
unit_prices = [row['Unit price']
               for row in dataset.select('Unit price').dropna().collect()]
plt.hist(unit_prices, bins=20, color='gray', edgecolor='black')
plt.xlabel('Unit Price')
plt.ylabel('Frequency')
plt.title('Histogram of Unit Price')
plt.show()
# Pie chart of male vs. female customers.
gender_counts = dataset.groupBy('Gender').count().toPandas()
plt.pie(
    gender_counts['count'],
    labels=gender_counts['Gender'],
    autopct='%1.1f%%',
    colors=['lightpink', 'lightblue'],
)
plt.title('Pie Chart of Gender Distribution')
plt.axis('equal')
plt.show()
# Shut down the PySpark session.
spark.stop()
Apache (no date) Apache/Hadoop: Apache hadoop, GitHub. Available at: https://github.com/apache/hadoop (Accessed: 15 September 2023).
| cite this for me (no date) Cite This For Me, a Chegg service. Available at: https://www.citethisforme.com/bibliographies/90d022d9-c2cc-4c9f-8547-f2421cabd532 (Accessed: 15 September 2023).
Seahboonsiew (no date) Seahboonsiew/pyspark-CSV: An external pyspark module that works like R’s read.csv or Panda’s read_csv, with automatic type inference and null value handling. parses CSV data into SCHEMARDD. no installation required, simply include pyspark_csv.py via SparkContext., GitHub. Available at: https://github.com/seahboonsiew/pyspark-csv (Accessed: 15 September 2023).
Learn (no date) scikit. Available at: https://scikit-learn.org/stable/ (Accessed: 15 September 2023).
Matplotlib (no date) Matplotlib/mplfinance: Financial Markets Data Visualization using matplotlib, GitHub. Available at: https://github.com/matplotlib/mplfinance (Accessed: 17 August 2023).
OpenAI. (2023). ChatGPT (August 3 Version) [Large language model].
PySpark overview¶ (no date) PySpark Overview - PySpark 3.5.0 documentation. Available at: https://spark.apache.org/docs/latest/api/python/index.html (Accessed: 15 September 2023).
Low-code data app development (no date) Plotly. Available at: https://plotly.com/ (Accessed: 15 September 2023).
Sj (2022) Supermarket store branches sales analysis, Kaggle. Available at: https://www.kaggle.com/datasets/surajjha101/stores-area-and-sales-data (Accessed: 15 September 2023).
What is HDFS? Apache Hadoop Distributed File System (no date) IBM. Available at: https://www.ibm.com/topics/hdfs#:~:text=HDFS%20is%20a%20distributed%20file,others%20being%20MapReduce%20and%20YARN. (Accessed: 15 September 2023).
What is Hadoop and what is it used for? | google cloud (no date) Google. Available at: https://cloud.google.com/learn/what-is-hadoop (Accessed: 11 September 2023).
Unsplash (no date) Beautiful free images & pictures, Unsplash. Available at: https://unsplash.com/ (Accessed: 20 August 2023).
Visualization with python (no date) Matplotlib. Available at: https://matplotlib.org/ (Accessed: 1 September 2023).
Your machine learning and Data Science Community (no date) Kaggle. Available at: https://www.kaggle.com/ (Accessed: 15 September 2023).
Spark-Examples (no date) Spark-examples/pyspark-examples: Pyspark RDD, DataFrame and dataset examples in Python language, GitHub. Available at: https://github.com/spark-examples/pyspark-examples (Accessed: 15 September 2023).
Budnack, J.R. (2021) Start working with HDFS from the command line, Stark & Wayne. Available at: https://www.starkandwayne.com/blog/working-with-hdfs/index.html (Accessed: 15 September 2023).
# Display the completed assessment form image.
assessment_form = "Assessmentform.png"
display(Image(filename=assessment_form))